library(faraway)
# Load the data set explicitly before attaching it, matching the order used
# for the other data sets in this file.
data(teengamb)
# NOTE: attach() exposes a snapshot of the columns on the search path; later
# edits to `teengamb` (e.g. converting `sex` to a factor) are not reflected
# in the attached copy.
attach(teengamb)
# Preview the first rows.
head(teengamb)
##   sex status income verbal gamble
## 1   1     51   2.00      8    0.0
## 2   1     28   2.50      8    0.0
## 3   1     37   2.00      6    0.0
## 4   1     28   7.00      4    7.3
## 5   1     65   2.00      8   19.6
## 6   1     61   3.47      6    0.1
# Per-column numerical summary (min, quartiles, mean, max) of teengamb.
summary(teengamb)
##       sex             status          income           verbal     
##  Min.   :0.0000   Min.   :18.00   Min.   : 0.600   Min.   : 1.00  
##  1st Qu.:0.0000   1st Qu.:28.00   1st Qu.: 2.000   1st Qu.: 6.00  
##  Median :0.0000   Median :43.00   Median : 3.250   Median : 7.00  
##  Mean   :0.4043   Mean   :45.23   Mean   : 4.642   Mean   : 6.66  
##  3rd Qu.:1.0000   3rd Qu.:61.50   3rd Qu.: 6.210   3rd Qu.: 8.00  
##  Max.   :1.0000   Max.   :75.00   Max.   :15.000   Max.   :10.00  
##      gamble     
##  Min.   :  0.0  
##  1st Qu.:  1.1  
##  Median :  6.0  
##  Mean   : 19.3  
##  3rd Qu.: 19.4  
##  Max.   :156.0

Sex is a categorical variable

# Recode the 0/1 indicator as a factor so it is treated as categorical
# (per the section labels further down, 1 = female and 0 = male).
teengamb[["sex"]] <- as.factor(teengamb[["sex"]])

A numerical summary divided by sex is useful for understanding patterns if present

# Apply summary() separately to the rows of each sex level.
# The printed group headers echo the grouping expression as written here.
by(teengamb, teengamb$sex, summary)
## teengamb$sex: 0
##  sex        status          income           verbal           gamble       
##  0:28   Min.   :18.00   Min.   : 0.600   Min.   : 1.000   Min.   :  0.000  
##  1: 0   1st Qu.:38.00   1st Qu.: 2.000   1st Qu.: 6.000   1st Qu.:  2.775  
##         Median :51.00   Median : 3.375   Median : 7.000   Median : 14.250  
##         Mean   :52.00   Mean   : 4.976   Mean   : 6.821   Mean   : 29.775  
##         3rd Qu.:65.25   3rd Qu.: 6.625   3rd Qu.: 8.250   3rd Qu.: 42.175  
##         Max.   :75.00   Max.   :15.000   Max.   :10.000   Max.   :156.000  
## ------------------------------------------------------------ 
## teengamb$sex: 1
##  sex        status          income           verbal          gamble      
##  0: 0   Min.   :18.00   Min.   : 1.500   Min.   :4.000   Min.   : 0.000  
##  1:19   1st Qu.:28.00   1st Qu.: 2.000   1st Qu.:6.000   1st Qu.: 0.100  
##         Median :30.00   Median : 3.000   Median :6.000   Median : 1.700  
##         Mean   :35.26   Mean   : 4.149   Mean   :6.421   Mean   : 3.866  
##         3rd Qu.:43.00   3rd Qu.: 5.750   3rd Qu.:8.000   3rd Qu.: 6.000  
##         Max.   :65.00   Max.   :10.000   Max.   :8.000   Max.   :19.600
# Scatterplot matrix of every variable (sex included).
pairs(teengamb)

# Pairwise correlations of the numeric variables; `sex` is dropped by name.
cor(teengamb[, setdiff(names(teengamb), "sex")])
##             status     income     verbal      gamble
## status  1.00000000 -0.2750340  0.5316102 -0.05042081
## income -0.27503402  1.0000000 -0.1755707  0.62207690
## verbal  0.53161022 -0.1755707  1.0000000 -0.22005619
## gamble -0.05042081  0.6220769 -0.2200562  1.00000000

We remove the qualitative variable, for which correlations cannot be evaluated.

Female

# Correlations among the numeric variables for the rows with sex == 1.
cor(subset(teengamb, sex == 1, select = -sex))
##            status     income      verbal     gamble
## status  1.0000000 -0.4870717  0.33460676 0.36090977
## income -0.4870717  1.0000000 -0.21463814 0.08823560
## verbal  0.3346068 -0.2146381  1.00000000 0.07068478
## gamble  0.3609098  0.0882356  0.07068478 1.00000000

Male

# Correlations among the numeric variables for the rows with sex == 0.
cor(subset(teengamb, sex == 0, select = -sex))
##            status     income     verbal     gamble
## status  1.0000000 -0.3454141  0.6296585 -0.3997019
## income -0.3454141  1.0000000 -0.1837980  0.7136690
## verbal  0.6296585 -0.1837980  1.0000000 -0.3325610
## gamble -0.3997019  0.7136690 -0.3325610  1.0000000

Graphical summary

# Gambling expenditure by sex; `sex` is taken from the data frame passed in.
plot(gamble ~ sex, data = teengamb)

We can identify outliers

# Pass the data frame explicitly so that `sex` is the factor stored in
# `teengamb` rather than the numeric copy exposed by the earlier attach().
# This yields side-by-side boxplots, consistent with the gamble ~ sex plot.
plot(income ~ sex, data = teengamb)

# Gambling expenditure against income. The plotting symbol encodes the 0/1
# sex code (pch 0 = square, pch 1 = circle); going through as.character()
# recovers that code whether `sex` is stored as a factor or as a number.
plot(gamble ~ income, data = teengamb,
     pch = as.integer(as.character(teengamb$sex)))

# Bring in the wage data, expose its columns on the search path, and
# preview the first six rows.
data("uswages")
attach(uswages)
head(uswages, n = 6L)
##         wage educ exper race smsa ne mw so we pt
## 6085  771.60   18    18    0    1  1  0  0  0  0
## 23701 617.28   15    20    0    1  0  0  0  1  0
## 16208 957.83   16     9    0    1  0  0  1  0  0
## 2720  617.28   12    24    0    1  1  0  0  0  0
## 9723  902.18   14    12    0    1  0  1  0  0  0
## 22239 299.15   12    33    0    1  0  0  0  1  0
# Per-column numerical summary of uswages (indicator columns are still numeric here).
summary(uswages)
##       wage              educ           exper            race      
##  Min.   :  50.39   Min.   : 0.00   Min.   :-2.00   Min.   :0.000  
##  1st Qu.: 308.64   1st Qu.:12.00   1st Qu.: 8.00   1st Qu.:0.000  
##  Median : 522.32   Median :12.00   Median :15.00   Median :0.000  
##  Mean   : 608.12   Mean   :13.11   Mean   :18.41   Mean   :0.078  
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00   3rd Qu.:0.000  
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00   Max.   :1.000  
##       smsa             ne              mw               so        
##  Min.   :0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.756   Mean   :0.229   Mean   :0.2485   Mean   :0.3125  
##  3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##        we             pt        
##  Min.   :0.00   Min.   :0.0000  
##  1st Qu.:0.00   1st Qu.:0.0000  
##  Median :0.00   Median :0.0000  
##  Mean   :0.21   Mean   :0.0925  
##  3rd Qu.:0.00   3rd Qu.:0.0000  
##  Max.   :1.00   Max.   :1.0000

Change categorical variables

# Recode every 0/1 indicator column as a factor in a single pass.
indicator_cols <- c("race", "smsa", "ne", "mw", "so", "we", "pt")
uswages[indicator_cols] <- lapply(uswages[indicator_cols], as.factor)

A numerical summary divided by race is useful for understanding patterns if present

# Apply summary() separately to the rows of each race level.
# The printed group headers echo the grouping expression as written here.
by(uswages, uswages$race, summary)
## uswages$race: 0
##       wage              educ          exper       race     smsa     ne      
##  Min.   :  50.39   Min.   : 0.0   Min.   :-2.00   0:1844   0: 459   0:1408  
##  1st Qu.: 315.81   1st Qu.:12.0   1st Qu.: 8.00   1:   0   1:1385   1: 436  
##  Median : 522.32   Median :12.0   Median :15.00                             
##  Mean   : 620.98   Mean   :13.2   Mean   :18.29                             
##  3rd Qu.: 795.59   3rd Qu.:16.0   3rd Qu.:27.00                             
##  Max.   :7716.05   Max.   :18.0   Max.   :59.00                             
##  mw       so       we       pt      
##  0:1373   0:1314   0:1437   0:1677  
##  1: 471   1: 530   1: 407   1: 167  
##                                     
##                                     
##                                     
##                                     
## ------------------------------------------------------------ 
## uswages$race: 1
##       wage              educ           exper       race    smsa    ne     
##  Min.   :  52.23   Min.   : 0.00   Min.   :-1.00   0:  0   0: 29   0:134  
##  1st Qu.: 237.42   1st Qu.:11.75   1st Qu.: 9.75   1:156   1:127   1: 22  
##  Median : 398.46   Median :12.00   Median :17.50                          
##  Mean   : 456.04   Mean   :12.11   Mean   :19.83                          
##  3rd Qu.: 641.03   3rd Qu.:14.00   3rd Qu.:27.00                          
##  Max.   :2374.15   Max.   :18.00   Max.   :58.00                          
##  mw      so     we      pt     
##  0:130   0:61   0:143   0:138  
##  1: 26   1:95   1: 13   1: 18  
##                                
##                                
##                                
## 
# Scatterplot matrix of the three quantitative variables, selected by name.
pairs(uswages[, c("wage", "educ", "exper")])

# Their pairwise correlations.
cor(uswages[, c("wage", "educ", "exper")])
##            wage       educ      exper
## wage  1.0000000  0.2483358  0.1832012
## educ  0.2483358  1.0000000 -0.3024788
## exper 0.1832012 -0.3024788  1.0000000

We remove the qualitative variables, for which correlations cannot be evaluated.

Black

# Correlations among the quantitative variables for the rows with race == 1.
cor(subset(uswages, race == 1, select = c(wage, educ, exper)))
##            wage       educ      exper
## wage  1.0000000  0.1869139  0.1398199
## educ  0.1869139  1.0000000 -0.4211823
## exper 0.1398199 -0.4211823  1.0000000

White

# Correlations among the quantitative variables for the rows with race == 0.
cor(subset(uswages, race == 0, select = c(wage, educ, exper)))
##            wage       educ      exper
## wage  1.0000000  0.2445087  0.1909131
## educ  0.2445087  1.0000000 -0.2918642
## exper 0.1909131 -0.2918642  1.0000000

Graphical summary

# Distribution of each quantitative variable by race; `race` is a factor in
# `uswages`, so each call draws one box per level.
plot(educ ~ race, data = uswages)

plot(exper ~ race, data = uswages)

plot(wage ~ race, data = uswages)

We can identify outliers

# Scatterplots of the quantitative variables, with the plotting symbol
# encoding the 0/1 race code (pch 0 = square, pch 1 = circle).
# The data frame is passed explicitly, as in the plots by race above, so the
# variables no longer depend on the copies exposed by attach(). Going through
# as.character() recovers the 0/1 code whether `race` is stored as a factor
# or as a number.
race_pch <- as.integer(as.character(uswages$race))

plot(wage ~ educ, data = uswages, pch = race_pch)

plot(wage ~ exper, data = uswages, pch = race_pch)

plot(educ ~ exper, data = uswages, pch = race_pch)

# Bring in the prostate data, expose its columns on the search path, and
# preview the first six rows.
data("prostate")
attach(prostate)
head(prostate, n = 6L)
##       lcavol lweight age      lbph svi      lcp gleason pgg45     lpsa
## 1 -0.5798185  2.7695  50 -1.386294   0 -1.38629       6     0 -0.43078
## 2 -0.9942523  3.3196  58 -1.386294   0 -1.38629       6     0 -0.16252
## 3 -0.5108256  2.6912  74 -1.386294   0 -1.38629       7    20 -0.16252
## 4 -1.2039728  3.2828  58 -1.386294   0 -1.38629       6     0 -0.16252
## 5  0.7514161  3.4324  62 -1.386294   0 -1.38629       6     0  0.37156
## 6 -1.0498221  3.2288  50 -1.386294   0 -1.38629       6     0  0.76547
# Per-column numerical summary of prostate (`svi` is still numeric here).
summary(prostate)
##      lcavol           lweight           age             lbph        
##  Min.   :-1.3471   Min.   :2.375   Min.   :41.00   Min.   :-1.3863  
##  1st Qu.: 0.5128   1st Qu.:3.376   1st Qu.:60.00   1st Qu.:-1.3863  
##  Median : 1.4469   Median :3.623   Median :65.00   Median : 0.3001  
##  Mean   : 1.3500   Mean   :3.653   Mean   :63.87   Mean   : 0.1004  
##  3rd Qu.: 2.1270   3rd Qu.:3.878   3rd Qu.:68.00   3rd Qu.: 1.5581  
##  Max.   : 3.8210   Max.   :6.108   Max.   :79.00   Max.   : 2.3263  
##       svi              lcp             gleason          pgg45       
##  Min.   :0.0000   Min.   :-1.3863   Min.   :6.000   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:-1.3863   1st Qu.:6.000   1st Qu.:  0.00  
##  Median :0.0000   Median :-0.7985   Median :7.000   Median : 15.00  
##  Mean   :0.2165   Mean   :-0.1794   Mean   :6.753   Mean   : 24.38  
##  3rd Qu.:0.0000   3rd Qu.: 1.1786   3rd Qu.:7.000   3rd Qu.: 40.00  
##  Max.   :1.0000   Max.   : 2.9042   Max.   :9.000   Max.   :100.00  
##       lpsa        
##  Min.   :-0.4308  
##  1st Qu.: 1.7317  
##  Median : 2.5915  
##  Mean   : 2.4784  
##  3rd Qu.: 3.0564  
##  Max.   : 5.5829

Change categorical variables

# Recode the 0/1 `svi` indicator as a factor so it is treated as categorical.
prostate[["svi"]] <- as.factor(prostate[["svi"]])

A numerical summary divided by svi is useful for understanding patterns if present

# Apply summary() separately to the rows of each svi level.
# The printed group headers echo the grouping expression as written here.
by(prostate, prostate$svi, summary)
## prostate$svi: 0
##      lcavol           lweight           age             lbph         svi   
##  Min.   :-1.3471   Min.   :2.375   Min.   :41.00   Min.   :-1.3863   0:76  
##  1st Qu.: 0.3602   1st Qu.:3.310   1st Qu.:60.00   1st Qu.:-1.3863   1: 0  
##  Median : 1.1616   Median :3.554   Median :64.00   Median : 0.4383         
##  Mean   : 1.0179   Mean   :3.624   Mean   :63.41   Mean   : 0.1655         
##  3rd Qu.: 1.6844   3rd Qu.:3.866   3rd Qu.:68.00   3rd Qu.: 1.6438         
##  Max.   : 3.2465   Max.   :6.108   Max.   :78.00   Max.   : 2.3263         
##       lcp             gleason          pgg45            lpsa        
##  Min.   :-1.3863   Min.   :6.000   Min.   : 0.00   Min.   :-0.4308  
##  1st Qu.:-1.3863   1st Qu.:6.000   1st Qu.: 0.00   1st Qu.: 1.5891  
##  Median :-1.3863   Median :7.000   Median : 5.00   Median : 2.2345  
##  Mean   :-0.6715   Mean   :6.632   Mean   :17.63   Mean   : 2.1366  
##  3rd Qu.:-0.3637   3rd Qu.:7.000   3rd Qu.:26.25   3rd Qu.: 2.8079  
##  Max.   : 2.3273   Max.   :9.000   Max.   :95.00   Max.   : 4.0298  
## ------------------------------------------------------------ 
## prostate$svi: 1
##      lcavol         lweight           age             lbph         svi   
##  Min.   :1.215   Min.   :3.237   Min.   :44.00   Min.   :-1.3863   0: 0  
##  1st Qu.:1.997   1st Qu.:3.582   1st Qu.:62.00   1st Qu.:-1.3863   1:21  
##  Median :2.661   Median :3.774   Median :68.00   Median :-0.5276         
##  Mean   :2.552   Mean   :3.755   Mean   :65.52   Mean   :-0.1353         
##  3rd Qu.:2.907   3rd Qu.:3.897   3rd Qu.:69.00   3rd Qu.: 1.3481         
##  Max.   :3.821   Max.   :4.718   Max.   :79.00   Max.   : 2.0082         
##       lcp            gleason         pgg45             lpsa      
##  Min.   :-1.386   Min.   :7.00   Min.   : 10.00   Min.   :2.214  
##  1st Qu.: 1.179   1st Qu.:7.00   1st Qu.: 30.00   1st Qu.:3.056  
##  Median : 1.910   Median :7.00   Median : 50.00   Median :3.565  
##  Mean   : 1.602   Mean   :7.19   Mean   : 48.81   Mean   :3.715  
##  3rd Qu.: 2.420   3rd Qu.:7.00   3rd Qu.: 60.00   3rd Qu.:4.130  
##  Max.   : 2.904   Max.   :9.00   Max.   :100.00   Max.   :5.583
# Scatterplot matrix of all variables except the `svi` factor.
pairs(prostate[, setdiff(names(prostate), "svi")])

# Pairwise correlations of the same columns.
cor(prostate[, setdiff(names(prostate), "svi")])
##             lcavol      lweight       age        lbph         lcp      gleason
## lcavol  1.00000000  0.194128387 0.2249999  0.02734971  0.67531058  0.432417052
## lweight 0.19412839  1.000000000 0.3075247  0.43493174  0.10023889 -0.001283003
## age     0.22499988  0.307524741 1.0000000  0.35018592  0.12766778  0.268891599
## lbph    0.02734971  0.434931744 0.3501859  1.00000000 -0.00699944  0.077820444
## lcp     0.67531058  0.100238891 0.1276678 -0.00699944  1.00000000  0.514829912
## gleason 0.43241705 -0.001283003 0.2688916  0.07782044  0.51482991  1.000000000
## pgg45   0.43365224  0.050846195 0.2761124  0.07846000  0.63152807  0.751904512
## lpsa    0.73446028  0.354121818 0.1695929  0.17980950  0.54881316  0.368986693
##             pgg45      lpsa
## lcavol  0.4336522 0.7344603
## lweight 0.0508462 0.3541218
## age     0.2761124 0.1695929
## lbph    0.0784600 0.1798095
## lcp     0.6315281 0.5488132
## gleason 0.7519045 0.3689867
## pgg45   1.0000000 0.4223157
## lpsa    0.4223157 1.0000000

We remove the qualitative variable, for which correlations cannot be evaluated.

With seminal vesicle invasion

# Correlations among the numeric variables for the rows with svi == 1.
cor(subset(prostate, svi == 1, select = -svi))
##             lcavol       lweight         age        lbph         lcp    gleason
## lcavol   1.0000000  0.2016764497 -0.24688623 -0.14899899  0.53983778 -0.1330999
## lweight  0.2016764  1.0000000000  0.23071571  0.08582420  0.19705639 -0.1409462
## age     -0.2468862  0.2307157098  1.00000000  0.33635108 -0.03212035  0.1679336
## lbph    -0.1489990  0.0858242041  0.33635108  1.00000000 -0.05658159  0.3176501
## lcp      0.5398378  0.1970563913 -0.03212035 -0.05658159  1.00000000  0.1004554
## gleason -0.1330999 -0.1409461563  0.16793362  0.31765006  0.10045543  1.0000000
## pgg45   -0.1155055 -0.0003723927  0.31423433  0.29951832  0.27429066  0.5967197
## lpsa     0.4721634  0.0122011324 -0.31466981 -0.18892183  0.06489636 -0.3210767
##                 pgg45        lpsa
## lcavol  -0.1155055281  0.47216337
## lweight -0.0003723927  0.01220113
## age      0.3142343310 -0.31466981
## lbph     0.2995183241 -0.18892183
## lcp      0.2742906649  0.06489636
## gleason  0.5967197325 -0.32107670
## pgg45    1.0000000000 -0.22100690
## lpsa    -0.2210068987  1.00000000

Without seminal vesicle invasion

# Correlations among the numeric variables for the rows with svi == 0.
cor(subset(prostate, svi == 0, select = -svi))
##            lcavol      lweight       age       lbph        lcp     gleason
## lcavol  1.0000000  0.157723389 0.2768497 0.12484824 0.50355027  0.39252917
## lweight 0.1577234  1.000000000 0.3176750 0.50781401 0.01090040 -0.02433415
## age     0.2768497  0.317675007 1.0000000 0.37245238 0.09440620  0.26610078
## lbph    0.1248482  0.507814006 0.3724524 1.00000000 0.09986765  0.06892492
## lcp     0.5035503  0.010900396 0.0944062 0.09986765 1.00000000  0.50262560
## gleason 0.3925292 -0.024334155 0.2661008 0.06892492 0.50262560  1.00000000
## pgg45   0.3169860  0.001492569 0.2333695 0.09129667 0.55197656  0.74941027
## lpsa    0.6495756  0.416117460 0.2472757 0.38765367 0.32929037  0.36236698
##               pgg45      lpsa
## lcavol  0.316985985 0.6495756
## lweight 0.001492569 0.4161175
## age     0.233369469 0.2472757
## lbph    0.091296671 0.3876537
## lcp     0.551976564 0.3292904
## gleason 0.749410267 0.3623670
## pgg45   1.000000000 0.3392641
## lpsa    0.339264111 1.0000000

Graphical summary

# Age and log-weight split by the `svi` factor.
plot(age ~ svi, data = prostate)

plot(lweight ~ svi, data = prostate)

# Log-weight and log cancer volume against age.
plot(lweight ~ age, data = prostate)

plot(lcavol ~ age, data = prostate)

# Load the SAT data and expose its columns on the search path.
# teengamb is still attached, so attach() reports that `verbal` from sat now
# masks the `verbal` column attached from teengamb (see the message below).
data(sat)
attach(sat)
## The following object is masked from teengamb:
## 
##     verbal
# Per-column numerical summary of sat.
summary(sat)
##      expend          ratio           salary          takers     
##  Min.   :3.656   Min.   :13.80   Min.   :25.99   Min.   : 4.00  
##  1st Qu.:4.882   1st Qu.:15.22   1st Qu.:30.98   1st Qu.: 9.00  
##  Median :5.768   Median :16.60   Median :33.29   Median :28.00  
##  Mean   :5.905   Mean   :16.86   Mean   :34.83   Mean   :35.24  
##  3rd Qu.:6.434   3rd Qu.:17.57   3rd Qu.:38.55   3rd Qu.:63.00  
##  Max.   :9.774   Max.   :24.30   Max.   :50.05   Max.   :81.00  
##      verbal           math           total       
##  Min.   :401.0   Min.   :443.0   Min.   : 844.0  
##  1st Qu.:427.2   1st Qu.:474.8   1st Qu.: 897.2  
##  Median :448.0   Median :497.5   Median : 945.5  
##  Mean   :457.1   Mean   :508.8   Mean   : 965.9  
##  3rd Qu.:490.2   3rd Qu.:539.5   3rd Qu.:1032.0  
##  Max.   :516.0   Max.   :592.0   Max.   :1107.0
# Scatterplot matrix of all sat variables.
pairs(sat)

# Pairwise correlation matrix of all sat variables.
cor(sat)
##            expend        ratio       salary     takers      verbal        math
## expend  1.0000000 -0.371025386  0.869801513  0.5926274 -0.41004987 -0.34941409
## ratio  -0.3710254  1.000000000 -0.001146081 -0.2130536  0.06376664  0.09542173
## salary  0.8698015 -0.001146081  1.000000000  0.6167799 -0.47696364 -0.40131282
## takers  0.5926274 -0.213053607  0.616779867  1.0000000 -0.89326296 -0.86938393
## verbal -0.4100499  0.063766636 -0.476963635 -0.8932630  1.00000000  0.97025604
## math   -0.3494141  0.095421730 -0.401312817 -0.8693839  0.97025604  1.00000000
## total  -0.3805370  0.081253823 -0.439883381 -0.8871187  0.99150325  0.99350238
##              total
## expend -0.38053700
## ratio   0.08125382
## salary -0.43988338
## takers -0.88711868
## verbal  0.99150325
## math    0.99350238
## total   1.00000000

There are high correlations between salary and expenditure, verbal and math, verbal and total, and math and total.

We can observe those strong correlations graphically.

# Scatterplots of the strongly correlated pairs.
# `data = sat` is given explicitly because `verbal` exists in more than one
# attached data set; without it the result depends on search-path order.
plot(expend ~ salary, data = sat)

plot(verbal ~ math, data = sat)

plot(verbal ~ total, data = sat)

plot(math ~ total, data = sat)

# Bring in the divorce data, expose its columns on the search path, and
# preview the first six rows.
data("divusa")
attach(divusa)
head(divusa, n = 6L)
##   year divorce unemployed femlab marriage birth military
## 1 1920     8.0        5.2  22.70     92.0 117.9   3.2247
## 2 1921     7.2       11.7  22.79     83.0 119.8   3.5614
## 3 1922     6.6        6.7  22.88     79.7 111.2   2.4553
## 4 1923     7.1        2.4  22.97     85.2 110.5   2.2065
## 5 1924     7.2        5.0  23.06     80.3 110.9   2.2889
## 6 1925     7.2        3.2  23.15     79.2 106.6   2.1735

Numerical summary

# Per-column numerical summary of divusa.
summary(divusa)
##       year         divorce        unemployed         femlab     
##  Min.   :1920   Min.   : 6.10   Min.   : 1.200   Min.   :22.70  
##  1st Qu.:1939   1st Qu.: 8.70   1st Qu.: 4.200   1st Qu.:27.47  
##  Median :1958   Median :10.60   Median : 5.600   Median :37.10  
##  Mean   :1958   Mean   :13.27   Mean   : 7.173   Mean   :38.58  
##  3rd Qu.:1977   3rd Qu.:20.30   3rd Qu.: 7.500   3rd Qu.:47.80  
##  Max.   :1996   Max.   :22.80   Max.   :24.900   Max.   :59.30  
##     marriage          birth           military     
##  Min.   : 49.70   Min.   : 65.30   Min.   : 1.940  
##  1st Qu.: 61.90   1st Qu.: 68.90   1st Qu.: 3.469  
##  Median : 74.10   Median : 85.90   Median : 9.102  
##  Mean   : 72.97   Mean   : 88.89   Mean   :12.365  
##  3rd Qu.: 80.00   3rd Qu.:107.30   3rd Qu.:14.266  
##  Max.   :118.10   Max.   :122.90   Max.   :86.641
# Pairwise correlation matrix of all divusa columns, including `year`.
cor(divusa)
##                    year     divorce unemployed      femlab   marriage
## year        1.000000000  0.87923753 -0.2344792  0.98598207 -0.6173255
## divorce     0.879237534  1.00000000 -0.2106019  0.91039698 -0.5342554
## unemployed -0.234479195 -0.21060188  1.0000000 -0.25746176 -0.2707630
## femlab      0.985982068  0.91039698 -0.2574618  1.00000000 -0.6486273
## marriage   -0.617325533 -0.53425537 -0.2707630 -0.64862728  1.0000000
## birth      -0.576313991 -0.72192425 -0.3138890 -0.60409490  0.6737273
## military    0.007267171  0.01857483 -0.4002930  0.05126339  0.2581983
##                 birth     military
## year       -0.5763140  0.007267171
## divorce    -0.7219242  0.018574832
## unemployed -0.3138890 -0.400292954
## femlab     -0.6040949  0.051263390
## marriage    0.6737273  0.258198260
## birth       1.0000000  0.140898643
## military    0.1408986  1.000000000

Graphical summary

# Scatterplot matrix of all divusa columns.
pairs(divusa) 

# Convert `year` to character, then plot marriage and female labour-force
# participation against it.
# NOTE(review): this overwrites the numeric `year` column in `divusa`, and how
# plot() treats a character predictor may depend on the R version. Confirm a
# categorical axis is really intended; otherwise keep `year` numeric.
divusa$year<-as.character(divusa$year)
plot(marriage~year,divusa)     

plot(femlab~year,divusa)